Team rotation #1

library(tidyverse)
library(corrplot)
library(ggfortify)
library(reshape2)
library(cowplot)
library(viridis)
library(RColorBrewer)
library(ggfortify)
library(pals)

-Essential paper to read
-Very useful paper about phenocams and chromatic coordinates
-raw image data if interested
-metadata url
-GRSM PhenoCam URL -GRSM PhenoCam ROI URL with ROI mask file
-GRSM gcc data csv URL
-Site to download imagedata

rm(list=ls())

# Claire's Rserver WD
setwd("/home/claire/Git/PhenoRepo/")

site <- "GRSM"

# Individual data--data for every image taken
ind_data <- read_csv(paste0("data/",site,"/",site,"_individual.csv"))
ind_data <- ind_data[,colSums(is.na(ind_data))<nrow(ind_data)]
# head(ind_data)
# summary(ind_data)

# Daily data--data for images aggregated by day
daily_data <- read_csv(paste0("data/",site,"/",site,"_daily.csv")) %>% 
  filter(image_count>0)
daily_data <- daily_data[,colSums(is.na(daily_data))<nrow(daily_data)]
# head(daily_data)
# summary(daily_data)

# GDD data -- response variable
gcc_data <- read_csv(paste0("data/",site,"/",site,"_gccTargets.csv"))
# head(gcc_data)
# summary(gcc_data)

gcol <- 'springgreen4'
rcol <- 'sienna3'
bcol <- 'dodgerblue3'
doy_pal <- colorRampPalette(kovesi.cyclic_mrybm_35_75_c68(366))(366)

Individual data – looking at time series plots of data for individual photos

Histograms of rcc and gcc values

# names(ind_data)

hist(ind_data$gcc,breaks=50); abline(v=mean(ind_data$gcc,na.rm=T),col='green')

hist(ind_data$rcc,breaks=50); abline(v=mean(ind_data$gcc,na.rm=T),col='red')

GCC hist: Sort of multi-modal – one small peak around 0.10, very high peak around 0.35, and moderate but broader peak around 0.42. Lets try to match these peaks up with times of the year?

RCC: One non-normal peak around 0.37, slight left skew to data.

Adding horizontal line between plots/plot notes sets

Correlation between the three color channels are in columns in the individual photo data set From meta data URL “correlation coefficient (across pixels) between red channel DN and green channel DN, over the ROI”

# histograms of correlation columns with mean as vertical red line
rgcorrmean <- mean(ind_data$r_g_correl,na.rm=T)
hist(ind_data$r_g_correl,main = paste("mean R-G corr = ", round(rgcorrmean,4))); abline(v=rgcorrmean,col='red')

gbcorrmean <- mean(ind_data$g_b_correl,na.rm=T)
hist(ind_data$g_b_correl,main = paste("mean G-B corr = ", round(gbcorrmean,4))); abline(v=gbcorrmean,col='red')

brcorrmean <- mean(ind_data$b_r_correl,na.rm=T)
hist(ind_data$b_r_correl,main = paste("mean B-R corr = ", round(brcorrmean,4))); abline(v=brcorrmean,col='red')

Of correlations between three color channels, highest is R-G (red green), then G-B, then B-R.
- Are there useful biological takeaways from this? What does it mean to say the G-B and are correlated? - This also doesn’t tell us pos. or neg. correlations. My intuition says that R is negative corr. with B and G, and G and B are positively correlated.


Plot of mean R, G, and B values against time.

#names(ind_data)

ind_data_melt <- melt(ind_data,id.vars = 'date',measure.vars = c('g_mean',"r_mean","b_mean"))

ggplot(ind_data_melt,aes(x=date,y=value))+
  geom_point(aes(color=variable),alpha=0.05,size=1)+
  facet_wrap(~variable)+
  labs(x="Year", y="Mean channel pixel value")+
  scale_color_manual(values=c(gcol,rcol,bcol))+
  theme_classic()
## Warning: Removed 30 rows containing missing values (geom_point).


# names(ind_data)
ind_data_melt2 <- melt(ind_data,id.vars = 'solar_elev',measure.vars = c('gcc',"rcc"))

ggplot(ind_data_melt2,aes(x=solar_elev,y=value))+
  geom_point(aes(color=variable),alpha=0.01)+
  geom_smooth(method='lm')+
  facet_wrap(~variable)+
  scale_color_manual(values=c(gcol,rcol))+
  theme_bw()
## Warning: Removed 20 rows containing non-finite values (stat_smooth).
## Warning: Removed 20 rows containing missing values (geom_point).

Sort of see 3 interesting groups in the green again, maybe related to the three peaks in the histogram


Daily data – Looking at time series plots of data for photos aggregated by day.
This is a plot of r mean, g mean, and b mean across time
-‘Mean’ explanation: “the mean value (for all images passing the selection criteria) of the mean (by image) red channel DN over the ROI”
- DN = digital number
- “Thus, each pixel in the image is associated with a digital number (“DN”) triplet, with each element in the triplet corresponding to the intensity of one of the colour layers. Therefore, the second step in the image analysis was to read in the images, and associated mask sequence, and to characterize the frequency distribution of the RGB DN triplets within the ROI. We did this separately for each ROI at each site, to produce the “all-image” data files contained in Data Record 3 (see below)."

# names(ind_data)

ggplot(daily_data,aes(x=date)) +
  geom_point(aes(y=r_mean),color=rcol,alpha=0.5)+
  geom_point(aes(y=g_mean),color=gcol,alpha=0.5)+
  geom_point(aes(y=b_mean),color=bcol,alpha=0.5)+
  geom_vline(xintercept = as.Date('2017-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2018-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2019-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2020-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2021-01-01'),color='grey50',alpha=0.5)+
  labs(y="mean daily pixel value for each channel")+
  theme_classic()


Plot gcc and rcc over time.
cc = chromatic coordinate = " the ratio of the amount of one primary color to the total amount of all three necessary to reproduce a given color."

ggplot(daily_data,aes(x=date))+
  geom_vline(xintercept = as.Date('2017-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2018-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2019-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2020-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2021-01-01'),color='grey50',alpha=0.5)+
  geom_point(aes(y=rcc_90),color=rcol,alpha=0.5)+
  geom_point(aes(y=gcc_90),color=gcol,alpha=0.5)+
  theme_classic()


Look more at gcc through time

gcc_df <- daily_data %>% 
  select(date,year,doy,
         midday_gcc,
         g_mean,
         gcc_mean,gcc_std,
         gcc_50,gcc_75,gcc_90)

ggplot(gcc_df,aes(x=date))+
  geom_point(aes(y=gcc_90,color=as.factor(year)))+
  theme_classic()


Maximum daily solar elevation vs gcc_90

# Look at max solar elev through time
mse_plot <- ggplot(daily_data,aes(x=date,y=max_solar_elev))+
  geom_vline(xintercept = as.Date('2017-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2018-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2019-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2020-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2021-01-01'),color='grey50',alpha=0.5)+
  geom_smooth()+
  # geom_line()+ # Weird where missing values
  theme_classic()

gcc_plot <- ggplot(daily_data,aes(x=date,y=gcc_90))+
  geom_vline(xintercept = as.Date('2017-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2018-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2019-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2020-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2021-01-01'),color='grey50',alpha=0.5)+
  geom_line(color=gcol)+
  theme_classic()

plot_grid(mse_plot,gcc_plot,ncol = 1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Try plotting together
ggplot(daily_data,aes(x=date))+
  geom_vline(xintercept = as.Date('2017-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2018-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2019-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2020-01-01'),color='grey50',alpha=0.5)+
  geom_vline(xintercept = as.Date('2021-01-01'),color='grey50',alpha=0.5)+
  geom_point(aes(y=log(max_solar_elev)/10),size=0.5,color='goldenrod3')+
  geom_line(aes(y=gcc_90),color=gcol)+
  labs(y="",title="GCC_90 and log(solar elev)/10 vs. time",subtitle = "Max solar elev transformed to compare peaks more easily")+
  # geom_line()+ # Weird where missing values
  theme_classic()

ggplot(daily_data,aes(y=gcc_90,x=max_solar_elev))+
  geom_point(aes(color=doy),alpha=0.5)+
  scale_color_gradientn(colors=doy_pal)+
  geom_smooth(method='lm')+
  theme_classic()+
  labs(color="Day of year")
## `geom_smooth()` using formula 'y ~ x'

summary(lm(gcc_90~max_solar_elev+doy,data = daily_data))
## 
## Call:
## lm(formula = gcc_90 ~ max_solar_elev + doy, data = daily_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.042906 -0.010036 -0.001024  0.012223  0.093129 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.662e-01  1.804e-03  147.54   <2e-16 ***
## max_solar_elev 1.927e-03  2.736e-05   70.43   <2e-16 ***
## doy            8.425e-05  4.244e-06   19.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01709 on 1462 degrees of freedom
## Multiple R-squared:  0.7768, Adjusted R-squared:  0.7765 
## F-statistic:  2544 on 2 and 1462 DF,  p-value: < 2.2e-16

Correlation plots

names(daily_data)
##  [1] "date"            "year"            "doy"             "image_count"    
##  [5] "midday_filename" "midday_r"        "midday_g"        "midday_b"       
##  [9] "midday_gcc"      "midday_rcc"      "r_mean"          "r_std"          
## [13] "g_mean"          "g_std"           "b_mean"          "b_std"          
## [17] "gcc_mean"        "gcc_std"         "gcc_50"          "gcc_75"         
## [21] "gcc_90"          "rcc_mean"        "rcc_std"         "rcc_50"         
## [25] "rcc_75"          "rcc_90"          "max_solar_elev"
corr_df <- daily_data %>% 
  select(1,2,6:27)
summary(corr_df)
##       date                 year         midday_r         midday_g     
##  Min.   :2017-01-24   Min.   :2017   Min.   : 61.64   Min.   : 58.72  
##  1st Qu.:2018-03-02   1st Qu.:2018   1st Qu.: 94.28   1st Qu.: 95.15  
##  Median :2019-03-03   Median :2019   Median :103.49   Median :100.03  
##  Mean   :2019-03-08   Mean   :2019   Mean   :102.83   Mean   :100.75  
##  3rd Qu.:2020-03-19   3rd Qu.:2020   3rd Qu.:111.79   3rd Qu.:106.69  
##  Max.   :2021-03-20   Max.   :2021   Max.   :142.95   Max.   :140.65  
##     midday_b        midday_gcc       midday_rcc         r_mean      
##  Min.   : 29.68   Min.   :0.3209   Min.   :0.2815   Min.   : 81.36  
##  1st Qu.: 51.21   1st Qu.:0.3476   1st Qu.:0.3659   1st Qu.: 92.16  
##  Median : 58.66   Median :0.3852   Median :0.3863   Median :101.86  
##  Mean   : 60.88   Mean   :0.3824   Mean   :0.3887   Mean   :101.46  
##  3rd Qu.: 69.50   3rd Qu.:0.4153   3rd Qu.:0.4107   3rd Qu.:109.16  
##  Max.   :109.42   Max.   :0.4562   Max.   :0.5345   Max.   :130.96  
##      r_std            g_mean           g_std            b_mean      
##  Min.   : 0.000   Min.   : 82.41   Min.   : 0.000   Min.   : 39.22  
##  1st Qu.: 6.764   1st Qu.: 96.61   1st Qu.: 4.341   1st Qu.: 55.69  
##  Median :10.033   Median :100.93   Median : 6.015   Median : 61.72  
##  Mean   :10.433   Mean   :101.05   Mean   : 6.179   Mean   : 63.95  
##  3rd Qu.:13.614   3rd Qu.:105.97   3rd Qu.: 7.861   3rd Qu.: 72.13  
##  Max.   :29.049   Max.   :118.85   Max.   :15.559   Max.   :110.77  
##      b_std           gcc_mean         gcc_std             gcc_50      
##  Min.   : 0.000   Min.   :0.3257   Min.   :0.000000   Min.   :0.3246  
##  1st Qu.: 5.253   1st Qu.:0.3475   1st Qu.:0.001500   1st Qu.:0.3475  
##  Median : 7.383   Median :0.3849   Median :0.004260   Median :0.3854  
##  Mean   : 7.647   Mean   :0.3807   Mean   :0.004827   Mean   :0.3812  
##  3rd Qu.: 9.546   3rd Qu.:0.4111   3rd Qu.:0.007060   3rd Qu.:0.4121  
##  Max.   :27.878   Max.   :0.4459   Max.   :0.028810   Max.   :0.4491  
##      gcc_75           gcc_90          rcc_mean         rcc_std       
##  Min.   :0.3275   Min.   :0.3291   Min.   :0.3149   Min.   :0.00000  
##  1st Qu.:0.3482   1st Qu.:0.3488   1st Qu.:0.3566   1st Qu.:0.01588  
##  Median :0.3898   Median :0.3924   Median :0.3782   Median :0.02340  
##  Mean   :0.3837   Mean   :0.3856   Mean   :0.3800   Mean   :0.02542  
##  3rd Qu.:0.4159   3rd Qu.:0.4190   3rd Qu.:0.3989   3rd Qu.:0.03432  
##  Max.   :0.4539   Max.   :0.4557   Max.   :0.5044   Max.   :0.06922  
##      rcc_50           rcc_75           rcc_90       max_solar_elev 
##  Min.   :0.3063   Min.   :0.3198   Min.   :0.3220   Min.   :12.14  
##  1st Qu.:0.3630   1st Qu.:0.3721   1st Qu.:0.3777   1st Qu.:38.10  
##  Median :0.3829   Median :0.3920   Median :0.3995   Median :53.55  
##  Mean   :0.3866   Mean   :0.3951   Mean   :0.4013   Mean   :54.11  
##  3rd Qu.:0.4087   3rd Qu.:0.4150   3rd Qu.:0.4202   3rd Qu.:70.63  
##  Max.   :0.5241   Max.   :0.5324   Max.   :0.5346   Max.   :77.69
corrplot(cor(corr_df[,2:ncol(corr_df)]))


PCA Plots

# names(daily_data)

pca_df <- daily_data %>% 
  dplyr::select(year,doy,
                # midday_r,midday_g,midday_b,
                # r_mean,g_mean,b_mean,
                # gcc_mean,rcc_mean,
                gcc_90,rcc_90,
                max_solar_elev)

pca_res <- prcomp(pca_df, scale. = TRUE)

autoplot(pca_res, data = daily_data, 
         colour = 'year',
         # colour='grey50',
         loadings = TRUE, loadings.label=TRUE,
         loadings.colour='grey50',loadings.label.colour='darkred',
         loadings.label.size = 5,alpha = 0.5)+
  theme_classic()+
  scale_color_viridis(option = "D")
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.

Plot PC1/PC2 against other variables

PCs <- as.data.frame(pca_res$x)
PCs12_df <- cbind(PC1=PCs$PC1,PC2=PCs$PC2, daily_data)
# summary(PCs12_df)

ggplot(PCs12_df,aes(x=PC1,y=gcc_90))+
  geom_point(aes(color=doy),alpha=0.5)+
  scale_color_gradientn(colors=doy_pal)+
  geom_smooth(method="lm",se=T)+
  # geom_smooth(se=FALSE,color='red')+
  theme(legend.position = "none")+
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

https://www.sciencedirect.com/science/article/pii/S0378112716302298